# Helper functions
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
## Compute K stats and ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
## Estimate K stats
my.kstats <- function(NMFexperiment){
# calc different k stats
NMFexperiment <- computeFrobErrorStats(NMFexperiment)
NMFexperiment <- computeSilhoutteWidth(NMFexperiment)
NMFexperiment <- computeCopheneticCoeff(NMFexperiment)
NMFexperiment <- computeAmariDistances(NMFexperiment)
return(NMFexperiment)
}
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
## Plot K stats ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
my.plotKstats <- function(NMFexperiment, title){
# visualize k stats
gg.optKr <- plotKStats(NMFexperiment)
gg.optKr <- gg.optKr + theme_bw() +
ggtitle(title) +
theme(plot.title=element_text(hjust=0.5))
return(gg.optKr)
}
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
## Plot H matrix heatmap ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
my.plotHMatrices <- function(NMFexperiment, heat.anno, col=viridis(n=100)){
for(ki in names(NMFexperiment@HMatrixList)) {
cat("\n")
cat(" \n#### H matrix for k=", ki, " \n ")
#plot H matrix
tmp.hmatrix <- HMatrix(NMFexperiment, k = ki)
colnames(tmp.hmatrix) <- colnames(NMFexperiment)
h.heatmap <- Heatmap(tmp.hmatrix,
col = col,
name = "Exposure",
clustering_distance_columns = 'pearson',
show_column_dend = TRUE,
heatmap_legend_param =
list(color_bar = "continuous", legend_height=unit(2, "cm")),
top_annotation = heat.anno,
show_column_names = TRUE,
show_row_names = FALSE,
cluster_rows = FALSE)
print(h.heatmap)
}
}
#my.plotHMatrices(rna.norm.nmf.exp, heat.anno)
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
## Recovery plots functions ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
auc <- function(rnk.list,max=NULL) {
aux = sapply(rnk.list,function(rnk) {
if (is.null(max)) {max = max(rnk)}
rnk = sort(rnk)
X = 0
i = 1
ngenes = length(rnk)
while ((rnk[i] <= max) && (i <= length(rnk))) {X = X + max -rnk[i];i = i+1}
rauc = X/(i-1)/max
return(rauc)
})
return(aux)
}
roc <- function(rnk.list,max=NULL,title=NULL) {
require(RColorBrewer)
col = brewer.pal(length(rnk.list),'Set1')
rnk = c(1,rnk.list[[1]])
if (is.null(max)) {max = max(rnk)} else {rnk=c(rnk,max)}
plot(rnk,(1:length(rnk))/length(rnk),type='s',col=col[1],lwd=3,main=title,ylab='',xlab='Ranks')
for (i in 2:length(rnk.list)) {
rnk = c(1,rnk.list[[i]])
if (is.null(max)) {max = max(rnk)} else {rnk=c(rnk,max)}
lines(rnk,(1:length(rnk))/length(rnk),type='s',col=col[i],lwd=3)
}
L = length(rnk.list[[1]])
abline(1/L,(1-1/L)/(max),lty=2,lwd=2,col='darkgrey')
legend('bottomright',legend = names(rnk.list),col=col,lwd=3)
}
recovery_plot <- function(h, annot, annotID){
which.a = annotID
annot.factor <- annot[,annotID]
n.samples = nrow(annot)
ALL.RNKS = lapply(levels(annot.factor),function(l) {
RNKS=lapply(1:nrow(h),function(i) {
exp = sort(h[i,],decreasing=TRUE)
i.rnk = match(rownames(annot)[annot.factor==l],names(exp))
i.rnk = sort(i.rnk[!is.na(i.rnk)])
return(i.rnk)
})
names(RNKS) = paste0('Sig ',1:length(RNKS))
return(RNKS)
})
names(ALL.RNKS) = levels(annot.factor)
AUC.RAND = lapply(ALL.RNKS,function(r) {
do.call('rbind',lapply(r, function(x) {
##
l = lapply(1:500,function(i) {
sample(1:n.samples,length(x))
})
aux = auc(l,max=n.samples)
return(c(mean(aux),sd(aux)))
}))
})
AUC = lapply(ALL.RNKS,auc,max=n.samples)
PVAL = lapply(1:length(AUC),function(i) {
x = data.frame(AUC.RAND[[i]],AUC[[i]])
colnames(x) = c('mean','sd','val')
z = (x[,3]-x[,1])/x[,2]
p = ifelse(z>0,pnorm(z,lower.tail=FALSE),pnorm(z))
x$z = z
x$p = p
return(x)
})
names(PVAL) = names(AUC)
for (n in names(ALL.RNKS)) {
cat("\n")
cat(" \n##### ", n, " \n ")
#print(n)
RNKS = ALL.RNKS[[n]]
names(RNKS) = paste0(names(RNKS),' - Pval = ',sprintf('%.1e',PVAL[[n]][,5]))
roc(RNKS,max=n.samples,title=paste0(annotID,' - level : ',n))
}
}
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
## Plot H matrix heatmap integrative NMF ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
plotH_oneView <- function(int_nmf, viewID, k, heat.anno, col=viridis(100),
scale_color=TRUE, displayID = viewID){
# Find which iteration returned the best results
k <- paste0("k", k)
top_idx <- int_nmf@best_factorization_idx
idx <- top_idx[match(k, names(top_idx))]
sharedH <- int_nmf@shared_HMatrix_list[[k]][[idx]]
Hview <- int_nmf@view_specific_HMatrix_list[[k]][[idx]][[viewID]]
# Define total H matrix
totalH <- sharedH + Hview
# Color Function
if (scale_color) {
colf <- circlize::colorRamp2(seq(0, max(totalH), length.out = 100), col)
} else {
colf <- col
}
main_hist <- hclust(as.dist(1 - cor(totalH, method = "pearson")))
tH.heatmap <- Heatmap(totalH,
col = colf,
name = "Total Exposure",
column_title = "Total H matrix",
cluster_columns = main_hist,
show_column_dend = TRUE,
top_annotation = heat.anno,
show_column_names = FALSE,
show_row_names = FALSE,
cluster_rows = FALSE)
sH.heatmap <- Heatmap(sharedH,
col = colf,
name = "Shared Exposure",
column_title = "Shared H matrix",
cluster_columns = main_hist,
show_column_dend = TRUE,
top_annotation = heat.anno,
show_column_names = FALSE,
show_row_names = FALSE,
cluster_rows = FALSE)
vH.heatmap <- Heatmap(Hview,
col = colf,
name = "View Specific Exposure",
column_title = "View specific H matrix",
cluster_columns = main_hist,
show_column_dend = TRUE,
top_annotation = heat.anno,
show_column_names = FALSE,
show_row_names = FALSE,
cluster_rows = FALSE)
#print(tH.heatmap + sH.heatmap + vH.heatmap)
ht_global_opt(heatmap_legend_grid_height = unit(.25, "cm"))
ht_list <- tH.heatmap + sH.heatmap + vH.heatmap
draw(ht_list, row_title = displayID)
ht_global_opt(RESET = TRUE)
}
Read normalized gene expression matrix…
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
## Read normalized data ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
# read normalized matrix
rna.norm.mat <- readRDS("data/rnaseq/rnaseq_normalized_counts.RDS")
rna.annotation <- readRDS("data/rnaseq/rnaseq_annotation.RDS")
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
## Print dataset dimension ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
cat("Dimension of transcriptome dataset (RNAseq): \n\n ")
Dimension of transcriptome dataset (RNAseq):
kable(data.frame(dim(rna.norm.mat), row.names = c("features", "samples")),
col.names = "")
| features | 21811 |
| samples | 45 |
Applying Non-Negative Matrix Factorization (NMF) to normalized transcriptome data (RNAseq)
Factorization parameters:
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
## Parameters to run NMF in GPUs using pythonCuda ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
k.min <- 8
k.max <- 10
outer.iter <- 10
inner.iter <- 2*10^4
kable(data.frame(c(k.min, k.max, outer.iter, inner.iter),
row.names = c("k.min", "k.max", "outer.iter", "inner.iter")),
col.names = "")
| k.min | 8 |
| k.max | 10 |
| outer.iter | 10 |
| inner.iter | 20000 |
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
## Create nmf experiment object and run NMF ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
rna.nmf.exp <- nmfExperimentFromMatrix(matrix = rna.norm.mat)
rna.nmf.exp <- runNMFtensor(rna.nmf.exp,
k.min = k.min,
k.max = k.max,
outer.iter = outer.iter,
inner.iter = inner.iter,
conver.test.stop.threshold = 40)
## [1] "2019-04-02 10:15:40 UTC"
## Factorization rank: 8
## Iteration: 10
## [1] "NMF converged after 253,131,118,268,199,215,228,189,275,356 iterations"
## [1] "2019-04-02 10:16:04 UTC"
## Factorization rank: 9
## Iteration: 10
## [1] "NMF converged after 270,159,161,192,165,201,306,203,136,149 iterations"
## [1] "2019-04-02 10:16:21 UTC"
## Factorization rank: 10
## Iteration: 10
## [1] "NMF converged after 163,215,266,267,240,125,135,155,229,270 iterations"
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
## Normalize NMF ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
rna.norm.nmf.exp <- normalizeW(rna.nmf.exp)
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
## K stats and normalization ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
## Estimate K stats
rna.norm.nmf.exp <- my.kstats(rna.norm.nmf.exp)
Based on the results of the factorization quality metrics, an optimal number of signatures (k) must be chosen:
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
## Plot K stats ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
gg.optKr <- my.plotKstats(rna.norm.nmf.exp, "NMF factorization quality metrics")
gg.optKr
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
## Generate river plot ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
river <- generateRiverplot(rna.norm.nmf.exp)
plot(river, plot_area=1, yscale=0.6, nodewidth=0.5)
Minize the Frobenius error, the coefficient of variation and the mean Amari distance, while maximizing the sum and mean silhouette width and the cophenic coefficient.
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
## H matrix heatmap annotation ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
#Annotation for H matrix heatmap
# Define color vector
type.colVector <- list(Celltype = setNames(rna.annotation$color[match(levels(rna.annotation$Celltype), rna.annotation$Celltype)],
levels(rna.annotation$Celltype))
)
# Build Heatmap annotation
heat.anno <- HeatmapAnnotation(df = data.frame(Celltype = rna.annotation$Celltype),
col = type.colVector,
show_annotation_name = TRUE, na_col = "white")
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
## Generate H matrix heatmap, W normalized ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
for(ki in names(rna.norm.nmf.exp@HMatrixList)) {
cat("\n")
cat(" \n#### H matrix for k=", ki, " {.tabset} \n ")
#plot H matrix
tmp.hmatrix <- HMatrix(rna.norm.nmf.exp, k = ki)
colnames(tmp.hmatrix) <- colnames(rna.norm.nmf.exp)
h.heatmap <- Heatmap(tmp.hmatrix,
col = viridis(100),
name = "Exposure",
clustering_distance_columns = 'pearson',
show_column_dend = TRUE,
heatmap_legend_param =
list(color_bar = "continuous", legend_height=unit(2, "cm")),
top_annotation = heat.anno,
show_column_names = FALSE,
show_row_names = FALSE,
cluster_rows = FALSE)
print(h.heatmap)
cat(" \n Recovery plots for k=", ki, " \n ")
recovery_plot(tmp.hmatrix, rna.annotation, "Celltype")
}
Recovery plots for k= 8
Recovery plots for k= 9
Recovery plots for k= 10
Using the feature exposure extracted from the W matrix, a gene set enrichment analysis is perform agains all MSigDB terms
The optimal factorization rank selected was: K = 9
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
## W matrix Z scores ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
rna.wmatrix <- WMatrix(rna.norm.nmf.exp, k = 9)
rownames(rna.wmatrix) <- rownames(rna.norm.nmf.exp)
#Zscore for each signature
rna.wmatrix.zscores <- apply(rna.wmatrix, MARGIN=2, function(wmat_score){
(wmat_score - median(wmat_score)) / mad(wmat_score)
})
colnames(rna.wmatrix.zscores) <- paste0("Zscore_Sign", 1:9)
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
## GAGE (Generally Applicable Gene-set Enrichment) analysis ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
#Infer gene sets tha are significantly pertubed relative to all genes considered
#load precompiled GSEA MSigDB gene sets
gs.msigDB <- readList("db/msigdb.v6.2.symbols.gmt")
# head(gs.msigDB)
#run GAGE analysis
rna.msigDB.enrichment <- gage(rna.wmatrix.zscores, gsets=gs.msigDB, same.dir=TRUE)
#Drop NAs for upregulated
rna.msigDB.enrichment <- as.data.frame(rna.msigDB.enrichment$greater)
rna.msigDB.enrichment <- rna.msigDB.enrichment[!is.na(rna.msigDB.enrichment$p.geomean),]
rna.msigDB.enrichment <- rna.msigDB.enrichment[, paste0("Zscore_Sign", 1:9)]
# Select only more enriched terms in one signature compared to the others
idx <- apply(rna.msigDB.enrichment, 1, function(term){
term <- -log10(term)
# Change 0 to small value to avoid NAs
term[term == 0] <- 1e-40
# find if this term is more enriched in one signature compared to others
is.enrich <- sapply(term, function(x){
# p-value 5 times greater than at least 5 other signatures
sum(x/term > 5) > 5
})
any(is.enrich)
})
rna.msigDB.enrichment <- rna.msigDB.enrichment[idx,]
# Print table
datatable(rna.msigDB.enrichment, filter="top",
extensions = 'Buttons',
options = list(dom = 'Bfrtip',
buttons = list(list(extend = 'collection',
buttons = c('excel', 'csv'),
text = 'DOWNLOAD DATA')))) %>%
formatSignif(columns=colnames(rna.msigDB.enrichment), digits=3)
Read normalized chromatin accessibility matrix…
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
## Read normalized data ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
# read normalized matrix
atac.norm.mat <- readRDS("data/atacseq/atacseq_normalized_counts.RDS")
atac.annotation <- readRDS("data/atacseq/atacseq_annotation.RDS")
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
## Print dataset dimension ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
cat("Dimension of Chromatin accessibility dataset (ATACseq): \n\n ")
Dimension of Chromatin accessibility dataset (ATACseq):
kable(data.frame(dim(atac.norm.mat), row.names = c("features", "samples")),
col.names = "")
| features | 123102 |
| samples | 68 |
Applying Non-Negative Matrix Factorization (NMF) to normalized Chromatin accessibility data (ATACseq)
Factorization parameters:
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
## Parameters to run NMF in GPUs using pythonCuda ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
k.min <- 8
k.max <- 10
outer.iter <- 5
inner.iter <- 2*10^4
kable(data.frame(c(k.min, k.max, outer.iter, inner.iter),
row.names = c("k.min", "k.max", "outer.iter", "inner.iter")),
col.names = "")
| k.min | 8 |
| k.max | 10 |
| outer.iter | 5 |
| inner.iter | 20000 |
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
## Create nmf experiment object and run NMF ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
atac.nmf.exp <- nmfExperimentFromMatrix(matrix = atac.norm.mat)
atac.nmf.exp <- runNMFtensor(atac.nmf.exp,
k.min = k.min,
k.max = k.max,
outer.iter = outer.iter,
inner.iter = inner.iter,
conver.test.stop.threshold = 40)
## [1] "2019-04-02 10:18:06 UTC"
## Factorization rank: 8
## [1] "NMF converged after 196,195,123,307,301 iterations"
## [1] "2019-04-02 10:18:50 UTC"
## Factorization rank: 9
## [1] "NMF converged after 437,255,492,210,155 iterations"
## [1] "2019-04-02 10:19:58 UTC"
## Factorization rank: 10
## [1] "NMF converged after 248,190,161,202,234 iterations"
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
## Normalize NMF ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
atac.norm.nmf.exp <- normalizeW(atac.nmf.exp)
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
## K stats and normalization ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
## Estimate K stats
atac.norm.nmf.exp <- my.kstats(atac.norm.nmf.exp)
Based on the results of the factorization quality metrics, an optimal number of signatures (k) must be chosen:
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
## Plot K stats ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
gg.optKr <- my.plotKstats(atac.norm.nmf.exp, "NMF factorization quality metrics")
gg.optKr
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
## Generate river plot ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
river <- generateRiverplot(atac.norm.nmf.exp)
plot(river, plot_area=1, yscale=0.6, nodewidth=0.5)
Minize the Frobenius error, the coefficient of variation and the mean Amari distance, while maximizing the sum and mean silhouette width and the cophenic coefficient.
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
## H matrix heatmap annotation ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
#Annotation for H matrix heatmap
# Define color vector
type.colVector <- list(Celltype = setNames(atac.annotation$color[match(levels(atac.annotation$Celltype), atac.annotation$Celltype)],
levels(atac.annotation$Celltype))
)
# Build Heatmap annotation
heat.anno <- HeatmapAnnotation(df = data.frame(Celltype = atac.annotation$Celltype),
col = type.colVector,
show_annotation_name = TRUE, na_col = "white")
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
## Generate H matrix heatmap, W normalized ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
for(ki in names(atac.norm.nmf.exp@HMatrixList)) {
cat("\n")
cat(" \n#### H matrix for k=", ki, " {.tabset} \n ")
#plot H matrix
tmp.hmatrix <- HMatrix(atac.norm.nmf.exp, k = ki)
colnames(tmp.hmatrix) <- colnames(atac.norm.nmf.exp)
h.heatmap <- Heatmap(tmp.hmatrix,
col = viridis(100),
name = "Exposure",
clustering_distance_columns = 'pearson',
show_column_dend = TRUE,
heatmap_legend_param =
list(color_bar = "continuous", legend_height=unit(2, "cm")),
top_annotation = heat.anno,
show_column_names = FALSE,
show_row_names = FALSE,
cluster_rows = FALSE)
print(h.heatmap)
cat(" \n Recovery plots for k=", ki, " \n ")
recovery_plot(tmp.hmatrix, atac.annotation, "Celltype")
}
Recovery plots for k= 8
Recovery plots for k= 9
Recovery plots for k= 10
gc()
used (Mb) gc trigger (Mb) max used (Mb)
Ncells 11096612 592.7 18696788 998.6 18696788 998.6 Vcells 73660253 562.0 131581271 1003.9 131581269 1003.9
Gene expression (RNAseq) & Chromatin accessibility (ATACseq)
Only the those samples with RNAseq and ATACseq data were used in the integrative analysis.
Read normalized gene expression matrix and chromatin accessibility matrix…
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
## Read normalized data ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
# read normalized matrix
int.norm.mat <- readRDS("data/multiview/multiview_norm_mat_list.RDS")
int.annotation <- readRDS("data/multiview/multiview_annotation.RDS")
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
## Print dataset dimension ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
cat("Dimension of transcriptome dataset (RNAseq): \n\n ")
Dimension of transcriptome dataset (RNAseq):
kable(data.frame(dim(int.norm.mat$rna), row.names = c("features", "samples")),
col.names = "")
| features | 21811 |
| samples | 24 |
cat("Dimension of Chromatin accessibility dataset (ATACseq): \n\n ")
Dimension of Chromatin accessibility dataset (ATACseq):
kable(data.frame(dim(int.norm.mat$atac), row.names = c("features", "samples")),
col.names = "")
| features | 123102 |
| samples | 24 |
Applying Integrative Non-Negative Matrix Factorization (NMF) to normalized Gene expression (RNAseq) and Chromatin accessibility data (ATACseq)
Factorization parameters:
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
## Parameters to run NMF in GPUs using pythonCuda ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
k.min <- 8
k.max <- 10
outer.iter <- 5
inner.iter <- 2*10^4
kable(data.frame(c(k.min, k.max, outer.iter, inner.iter),
row.names = c("k.min", "k.max", "outer.iter", "inner.iter")),
col.names = "")
| k.min | 8 |
| k.max | 10 |
| outer.iter | 5 |
| inner.iter | 20000 |
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
## Create nmf experiment object and run NMF ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
int.nmf.exp <- run_integrative_NMF_tensor(int.norm.mat,
k_min = k.min,
k_max = k.max,
outer_iter = outer.iter,
inner_iter = inner.iter,
conver_stop_threshold = 40,
lambda=0.7)
## Running integrative NMF for views: rna,atac
## [1] "2019-04-02 10:22:38 UTC"
## Factorization rank: 8
## [1] "NMF converged after 158,240,140,122,208 iterations"
## [1] "2019-04-02 10:23:11 UTC"
## Factorization rank: 9
## [1] "NMF converged after 142,220,405,191,108 iterations"
## [1] "2019-04-02 10:23:47 UTC"
## Factorization rank: 10
## [1] "NMF converged after 180,141,179,149,182 iterations"
Based on the results of the factorization quality metrics, an optimal number of signatures (k) must be chosen:
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
## RNAseq - Plot K stats - River plot ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
## Plot K stats
gg.optKr <- my.plotKstats(int.nmf.exp@view_specific_NMFexp_list$rna, "RNAseq factorization quality metrics")
gg.optKr
## Generate river plot
river <- generateRiverplot(int.nmf.exp@view_specific_NMFexp_list$rna)
plot(river, plot_area=1, yscale=0.6, nodewidth=0.5)
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
## ATACseq - Plot K stats - River plot ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
## Plot K stats
gg.optKr <- my.plotKstats(int.nmf.exp@view_specific_NMFexp_list$atac, "ATACseq factorization quality metrics")
gg.optKr
## Generate river plot
river <- generateRiverplot(int.nmf.exp@view_specific_NMFexp_list$atac)
plot(river, plot_area=1, yscale=0.6, nodewidth=0.5)
Minize the Frobenius error, the coefficient of variation and the mean Amari distance, while maximizing the sum and mean silhouette width and the cophenic coefficient.
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
## H matrix heatmap annotation ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
#Annotation for H matrix heatmap
# Define color vector
type.colVector <- list(Celltype = setNames(int.annotation$color[match(levels(int.annotation$Celltype), int.annotation$Celltype)],
levels(int.annotation$Celltype))
)
# Build Heatmap annotation
heat.anno <- HeatmapAnnotation(df = data.frame(Celltype = int.annotation$Celltype),
col = type.colVector,
show_annotation_name = FALSE, na_col = "white")
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
## Generate H matrix heatmap, W normalized ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
for(ki in k.min:k.max) {
cat("\n")
#cat(" \n#### H matrix for k=", ki, " {.tabset} \n ")
cat(" \n#### H matrix for k=", ki, " {.tabset} \n ")
#plot H matrix
cat("\n")
cat(" \n##### not scaled \n ")
plotH_oneView(int.nmf.exp, k = ki, heat.anno = heat.anno,
scale_color = FALSE, viewID = "rna", displayID = "RNAseq")
plotH_oneView(int.nmf.exp, k = ki, heat.anno = heat.anno,
scale_color = FALSE, viewID = "atac", displayID = "ATACseq")
cat("\n")
cat(" \n##### scaled \n ")
plotH_oneView(int.nmf.exp, k = ki, heat.anno = heat.anno,
scale_color = TRUE, viewID = "rna", displayID = "RNAseq")
plotH_oneView(int.nmf.exp, k = ki, heat.anno = heat.anno,
scale_color = TRUE, viewID = "atac", displayID = "ATACseq")
}
Using the feature exposure extracted from the W matrix, a gene set enrichment analysis is perform agains all MSigDB terms
The optimal factorization rank selected was: K = 9
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
## W matrix Z scores ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
idx <- int.nmf.exp@best_factorization_idx
idx <- idx[names(idx) == "k9"]
int.rna.wmatrix <- int.nmf.exp@view_specific_WMatrix_list$k9[[idx]]$rna
rownames(int.rna.wmatrix) <- rownames(int.norm.mat$rna)
#Zscore for each signature
int.rna.wmatrix.zscores <- apply(int.rna.wmatrix, MARGIN=2, function(wmat_score){
(wmat_score - median(wmat_score)) / mad(wmat_score)
})
colnames(int.rna.wmatrix.zscores) <- paste0("Zscore_Sign", 1:9)
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
## GAGE (Generally Applicable Gene-set Enrichment) analysis ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
#Infer gene sets tha are significantly pertubed relative to all genes considered
#load precompiled GSEA MSigDB gene sets
gs.msigDB <- readList("db/msigdb.v6.2.symbols.gmt")
# head(gs.msigDB)
#run GAGE analysis
int.rna.msigDB.enrichment <- gage(int.rna.wmatrix.zscores, gsets=gs.msigDB, same.dir=TRUE)
#Drop NAs for upregulated
int.rna.msigDB.enrichment <- as.data.frame(int.rna.msigDB.enrichment$greater)
int.rna.msigDB.enrichment <- int.rna.msigDB.enrichment[!is.na(int.rna.msigDB.enrichment$p.geomean),]
int.rna.msigDB.enrichment <- int.rna.msigDB.enrichment[, paste0("Zscore_Sign", 1:9)]
# Select only more enriched terms in one signature compared to the others
idx <- apply(int.rna.msigDB.enrichment, 1, function(term){
term <- -log10(term)
# Change 0 to small value to avoid NAs
term[term == 0] <- 1e-40
# find if this term is more enriched in one signature compared to others
is.enrich <- sapply(term, function(x){
# p-value 5 times greater than at least 5 other signatures
sum(x/term > 5) > 5
})
any(is.enrich)
})
int.rna.msigDB.enrichment <- int.rna.msigDB.enrichment[idx,]
# Print table
datatable(int.rna.msigDB.enrichment, filter="top",
extensions = 'Buttons',
options = list(dom = 'Bfrtip',
buttons = list(list(extend = 'collection',
buttons = c('excel', 'csv'),
text = 'DOWNLOAD DATA')))) %>%
formatSignif(columns=colnames(int.rna.msigDB.enrichment), digits=3)
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
## Generate H matrix recovery plots ##
##––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––##
rownames(int.annotation) <- int.annotation$sampleID
int.annotation$Celltype <- factor(int.annotation$Celltype, levels = unique(int.annotation$Celltype))
for(ki in k.min:k.max) {
cat("\n")
#cat(" \n#### H matrix for k=", ki, " {.tabset} \n ")
cat(" \n### Recovery plots for k=", ki, " {.tabset} \n ")
#plot H matrix
# Find best shared matrix, according to factorization metrics
k <- paste0("k", ki)
top_idx <- int.nmf.exp@best_factorization_idx
idx <- top_idx[match(k, names(top_idx))]
cat("\n")
cat(" \n#### Shared H Matrix \n ")
sharedH <- int.nmf.exp@shared_HMatrix_list[[k]][[idx]]
colnames(sharedH) <- int.annotation$sampleID
# Make recovery plots
recovery_plot(sharedH, int.annotation, "Celltype")
cat("\n")
cat(" \n#### RNAseq H Matrix \n ")
viewH <- int.nmf.exp@view_specific_HMatrix_list[[k]][[idx]]$rna
colnames(viewH) <- int.annotation$sampleID
# Make recovery plots
recovery_plot(viewH, int.annotation, "Celltype")
cat("\n")
cat(" \n#### ATACseq H Matrix \n ")
viewH <- int.nmf.exp@view_specific_HMatrix_list[[k]][[idx]]$atac
colnames(viewH) <- int.annotation$sampleID
# Make recovery plots
recovery_plot(viewH, int.annotation, "Celltype")
}